##read in data file: dat<-read.csv(file='W:/R Course (sponsored by BERD) 2020/Part II/Topic6/dat.csv',header=T) ##check variables: summary(dat) ##analysis after removing missing values: mean(dat$y1) mean(dat$y1,na.rm=T) var(dat$y1) var(dat$y1,na.rm=T) cor(dat$y1,dat$y2) cor(dat$y1,dat$y2,use="complete.obs") lm(y2~y1+x1+x2,data=dat) ##check missing pattern: library(naniar) vis_miss(dat) gg_miss_upset(dat) gg_miss_var(dat) ##check missing mechanism: par(mfrow=c(3,3)) boxplot(dat$x1 ~ dat$r1, col="orange", main="Distribution of x1", ylab="x1", xlab="r1") boxplot(dat$x1 ~ dat$r2, col="red", main="Distribution of x1", ylab="x1", xlab="r2") boxplot(dat$x1 ~ dat$r3, col="blue", main="Distribution of x1", ylab="x1", xlab="r3") boxplot(dat$x2 ~ dat$r1, col="orange", main="Distribution of x2", ylab="x2", xlab="r1") boxplot(dat$x2 ~ dat$r2, col="red", main="Distribution of x2", ylab="x2", xlab="r2") boxplot(dat$x2 ~ dat$r3, col="blue", main="Distribution of x2", ylab="x2", xlab="r3") boxplot(dat$x3 ~ dat$r1, col="orange", main="Distribution of x3", ylab="x3", xlab="r1") boxplot(dat$x3 ~ dat$r2, col="red", main="Distribution of x3", ylab="x3", xlab="r2") boxplot(dat$x3 ~ dat$r3, col="blue", main="Distribution of x3", ylab="x3", xlab="r3") library(dplyr) library(ggplot2) library(tidyr) library(scales) ggplot(dat, aes(x= x4, group=r1)) + geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + geom_text(aes( label = scales::percent(..prop..), y= ..prop.. ), stat= "count", vjust = -.5) + labs(y = "Percent", fill="x4") + facet_grid(~r1) + scale_y_continuous(labels = scales::percent) ggplot(dat, aes(x= x4, group=r2)) + geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + geom_text(aes( label = scales::percent(..prop..), y= ..prop.. ), stat= "count", vjust = -.5) + labs(y = "Percent", fill="x4") + facet_grid(~r2) + scale_y_continuous(labels = scales::percent) ggplot(dat, aes(x= x4, group=r3)) + geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + geom_text(aes( label = scales::percent(..prop..), y= ..prop.. ), stat= "count", vjust = -.5) + labs(y = "Percent", fill="x4") + facet_grid(~r3) + scale_y_continuous(labels = scales::percent) ggplot(dat, aes(x= x5, group=r1)) + geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + geom_text(aes( label = scales::percent(..prop..), y= ..prop.. ), stat= "count", vjust = -.5) + labs(y = "Percent", fill="x5") + facet_grid(~r1) + scale_y_continuous(labels = scales::percent) ggplot(dat, aes(x= x5, group=r2)) + geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + geom_text(aes( label = scales::percent(..prop..), y= ..prop.. ), stat= "count", vjust = -.5) + labs(y = "Percent", fill="x5") + facet_grid(~r2) + scale_y_continuous(labels = scales::percent) ggplot(dat, aes(x= x5, group=r3)) + geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") + geom_text(aes( label = scales::percent(..prop..), y= ..prop.. ), stat= "count", vjust = -.5) + labs(y = "Percent", fill="x5") + facet_grid(~r3) + scale_y_continuous(labels = scales::percent) fit1<-glm(r1~x1+x2+x3+x4+as.character(x5),data=dat,family=binomial(link = "logit")) summary(fit1) fit2<-glm(r2~x1+x2+x3+x4+as.character(x5),data=dat,family=binomial(link = "logit")) summary(fit2) fit3<-glm(r3~x1+x2+x3+x4+as.character(x5),data=dat,family=binomial(link = "logit")) summary(fit3) ##imputation model check: pairs(dat[,c(2,3,4,7,8)]) par(mfrow=c(2,3)) boxplot(dat$y1 ~ dat$x4, col="orange", main="Distribution of y1", ylab="y1", xlab="x4") boxplot(dat$y1 ~ dat$x5, col="blue", main="Distribution of y1", ylab="y1", xlab="x5") boxplot(dat$y1 ~ dat$y3, col="red", main="Distribution of y1", ylab="y1", xlab="y3") boxplot(dat$y2 ~ dat$x4, col="orange", main="Distribution of y2", ylab="y2", xlab="x4") boxplot(dat$y2 ~ dat$x5, col="blue", main="Distribution of y2", ylab="y2", xlab="x5") boxplot(dat$y2 ~ dat$y3, col="red", main="Distribution of y2", ylab="y2", xlab="y3") fit1_m<-glm(y1~x1+x2+x3+x4+as.character(x5),data=dat) summary(fit1_m) fit2_m<-glm(y2~x1+x2+x3+x4+as.character(x5)+y1,data=dat) summary(fit2_m) fit3_m<-glm(y3~x1+x2+x3+x4+as.character(x5)+y1+y2,data=dat,family=binomial(link = "logit")) summary(fit3_m) ##imputation by using 'mice' package: library(mice) dat$x4<-as.factor(dat$x4) dat$x5<-as.factor(dat$x5) dat$y3<-as.factor(dat$y3) dat2<-dat[,c(2:9)] datm1<-mice(dat2) datm2<-mice(dat2, meth = c("pmm","pmm","pmm","logreg","polyreg","pmm","pmm","logreg")) PM<-rbind(c(0,rep(1,7)),c(1,0,rep(1,6)),c(1,1,0,rep(1,5)),c(rep(1,3),0,rep(1,4)), c(rep(1,4),0,rep(1,3)),c(rep(1,4),rep(0,4)),c(rep(1,4),0,1,0,0),c(1,1,1,0,0,1,1,0)) datm3<-mice(dat2,meth=c("pmm","pmm","pmm","logreg","polyreg","pmm","pmm","logreg"), predictorMatrix=PM) ##output: attributes(datm3) datm3$predictorMatrix ##pooled analysis by using Rubin's rule: fit <- with(data = datm3, exp = lm(y1 ~ x1 + x2 + x3 + x4)) summary(pool(fit)) fit <- with(data = datm3, exp = glm(y3 ~ x1 + x2 + x3 + x4 + y1 + y2,family=binomial(link = "logit"))) summary(pool(fit)) fit <- with(data = datm3, exp = lm(y1~1)) summary(pool(fit)) ##Propensity score weighting: #estimate propensity scores for responding: pfit1<-glm(r1~x1+x2+x3+x4+x5,data=dat,family=binomial(link = "logit")) ep1<-pfit1$fitted.values ew1<-1/ep1 pfit2<-glm(r2~x1+x2+x3+x4+x5,data=dat,family=binomial(link = "logit")) ep2<-pfit2$fitted.values ew2<-1/ep2 pfit3<-glm(r3~x1+x2+x3+x4+x5,data=dat,family=binomial(link = "logit")) ep3<-pfit3$fitted.values ew3<-1/ep3 dat3<-cbind(dat,ew1,ew2,ew3) ##weighted analysis: a1<-glm(y1~x1+x2+x3+x4,data=dat3,weights=ew1) summary(a1) a2<-glm(y2~x1+x2+x3+x4,data=dat3,weights=ew2) summary(a2) a3<-glm(y3~x1+x2+x3+x4,data=dat3,weights=ew3,family=binomial(link = "logit")) summary(a3)